Common functions for handling temperatures

Includes:

  • Temperature normal (climatological average) calculation functions
  • Degree-day calculation functions

In [1]:
# import sys
# import os
# import glob

import numpy as np
import scipy.interpolate
import pandas as pd

# import matplotlib as mpl
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
# import mpl_toolkits.axisartist
# from mpl_toolkits.axes_grid1 import host_subplot

# import dateutil

# import statsmodels.formula.api as smformula

# from collections import OrderedDict

# from IPython.display import display, HTML
# %matplotlib notebook
# plt.style.use('seaborn-paper')


Simple Fahrenheit/Celsius converters


In [2]:
def tempF2C(x):
    """Convert a temperature from degrees Fahrenheit to Celsius."""
    return (x-32.0)*5.0/9.0

def tempC2F(x):
    """Convert a temperature from degrees Celsius to Fahrenheit."""
    return (x*9.0/5.0)+32.0

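A quick round-trip sanity check of the converters (the values are arbitrary):

In [ ]:
assert tempF2C(32.0) == 0.0
assert tempC2F(100.0) == 212.0
assert abs(tempF2C(tempC2F(21.5)) - 21.5) < 1e-12
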
Functions for computing normals

Example usage, given that tempdf is a DataFrame or Series of hourly temperature values:

tempnorm, tempresamp = compute_year_over_year_norm(tempdf,
                                                   START_DATE, END_DATE,
                                                   norm_end=END_DATE,
                                                   freq='hourly',
                                                   interp_method='linear')

In [3]:
def compute_year_over_year_norm(in_dataframe,
                                start, end,
                                norm_start=None, norm_end=None,
                                freq='daily',
                                interp_method='linear',
                                quiet=False):
    """
    Will propagate the timezone of in_dataframe.index to the return values
    
    Parameters
    ----------
    start : convertible to Datetime
        start of the range of dates to output
    end : convertible to Datetime
        end of the range of dates to output
    norm_start : convertible to Datetime or None
        start of the range used to compute the norm;
        `None` will use in_dataframe.index[0]
    norm_end : convertible to Datetime or None
        end of the range used to compute the norm (inclusive);
        `None` will use in_dataframe.index[-1]
    freq : {'daily', 'hourly'}
    interp_method : str or None
        `None` will skip resample and interpolation, so 
        `in_dataframe` must already be daily or hourly (depending on `freq`)!
    
    Returns
    -------
    norm, resampled_in_dataframe
    
    Example
    -------
    tempnorm, tempresamp = compute_year_over_year_norm(tempdf,
                                                   START_DATE, END_DATE,
                                                   norm_end=END_DATE,
                                                   freq='hourly',
                                                   interp_method='linear',
                                                   quiet=False)
    """
    if freq == 'hourly':
        hrs = 24
        hrs_freq = '1h'
    elif freq == 'daily':
        hrs = 1
        hrs_freq = '24h'
    else:
        raise ValueError("Invalid `freq` argument value: {}".format(freq))
    if norm_start is None:
        norm_start = in_dataframe.index[0]
    else:
        norm_start = pd.to_datetime([norm_start])[0].tz_localize(in_dataframe.index.tz)
    if norm_end is None:
        norm_end = in_dataframe.index[-1]
    else:
        norm_end = pd.to_datetime([norm_end])[0].tz_localize(in_dataframe.index.tz)
    
    if not quiet:
        print('Computing using range:', norm_start, 'to', norm_end)
    
    if interp_method is None: # skip resample+interpolation (in_dataframe must already match `freq`!)
        t = in_dataframe.loc[norm_start:norm_end]
    else: # resample and interpolate to get hourly
        # pandas resample: simple but might require upsampling in some cases
        # t = in_dataframe.resample(hrs_freq).interpolate(method=interp_method).loc[norm_start:norm_end]
        # using scipy.interpolate.interp1d: seems safer / more reliable than pandas resample
        nidx = pd.date_range(norm_start, norm_end, freq=hrs_freq)
        if isinstance(in_dataframe, pd.DataFrame):
            t = pd.DataFrame([], index=nidx)
            for col in in_dataframe:   
                t[col] = scipy.interpolate.interp1d(in_dataframe.index.astype('int64').values, 
                                                    in_dataframe[col].values, 
                                                    kind=interp_method, 
                                                    fill_value=np.nan, #(0,1) 
                                                    bounds_error=False)(t.index.astype('int64').values)
        elif isinstance(in_dataframe, pd.Series):
            t = pd.Series(scipy.interpolate.interp1d(
                                        in_dataframe.index.astype('int64').values, 
                                        in_dataframe.values, 
                                        kind=interp_method,
                                        fill_value=np.nan, #(0,1) 
                                        bounds_error=False)(nidx.astype('int64').values),
                          index=nidx)

        else:
            raise TypeError("in_dataframe must be a pandas.DataFrame or pandas.Series")
            

    # actual norm calculation
    norm = t.groupby([t.index.month, t.index.day, t.index.hour]).mean().sort_index()

    # now replicate and trim to the desired output range
    start = pd.to_datetime(start).tz_localize(in_dataframe.index.tz)
    end = pd.to_datetime(end).tz_localize(in_dataframe.index.tz)
    
    # need a non-leapyear and leapyear version
    norm_ly = norm.copy()
    if norm.shape[0] == 366*hrs:
        norm = norm.drop((2,29,))
    else: # norm doesn't include any leapyear data
        assert norm.shape[0] == 365*hrs
        # make Feb 29 the mean of Feb 28 and Mar 1
        foo = (norm.loc[(2,28,)] + norm.loc[(3,1,)]) / 2.0
        foo.index = pd.MultiIndex.from_product( ([2],[29],list(range(hrs))) )
        norm_ly = pd.concat((norm_ly, foo)).sort_index()

    # build up a 'long normal' (lnorm) dataframe year by year by appending the norm or norm_ly
    lnorm = None
    for yr in np.arange(start.year, end.year+1):
        idx = pd.date_range(start='{}-{:02d}-{:02d} {:02d}:00:00'.format(yr,*norm.index[0]),
                            end=  '{}-{:02d}-{:02d} {:02d}:00:00'.format(yr,*norm.index[-1]),
                            freq=hrs_freq,
                            tz=in_dataframe.index.tz)
        if idx.shape[0] == 366*hrs:
            foo = norm_ly.copy()
        else:
            assert norm.shape[0] == 365*hrs
            foo = norm.copy()
        foo.index = idx
        if lnorm is None:
            lnorm = foo
        else:
            lnorm = pd.concat([lnorm, foo])  # DataFrame.append was removed in pandas 2.0

    return lnorm.loc[start:end], t

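For a self-contained illustration, here is the example above run on synthetic data; the two-year span, UTC time zone, and sinusoidal temperature model are arbitrary choices for this sketch, not data from any real station:

In [ ]:
# two years of synthetic hourly temperatures with seasonal and diurnal cycles
idx = pd.date_range('2014-01-01', '2015-12-31 23:00', freq='1h', tz='UTC')
temps = pd.Series(10.0
                  + 15.0*np.sin(2*np.pi*(idx.dayofyear-105)/365.25)  # seasonal cycle
                  + 5.0*np.sin(2*np.pi*(idx.hour-9)/24.0),           # diurnal cycle
                  index=idx)

# norm computed over 2014-2015 and replicated over (leap year) 2016
tempnorm, tempresamp = compute_year_over_year_norm(temps,
                                                   '2016-01-01', '2016-12-31',
                                                   freq='hourly',
                                                   interp_method='linear')
tempnorm.head()
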
Calculate degree-days and threshold crossings

Functions for computing degree-days

Compute the number of degree-days (DD) for each day of temperature data


In [4]:
# Used internally
def compute_threshold_passings(ddvals, cumulative_DD_threshold):
    """Find the index where cumsum of ddvals starting at each index passes the cumulative_DD_threshold
    ddvals needs to be a pandas Series or DataFrame column"""
    crossing_idx_rv = np.zeros(len(ddvals))*np.nan
    cDD = ddvals.cumsum(skipna=True)
    crossing_idx_rv = np.searchsorted(cDD, cDD+cumulative_DD_threshold-ddvals, side='left').astype(float)
    crossing_idx_rv[crossing_idx_rv==len(crossing_idx_rv)] = np.nan 
    #crossing_idx_rv[np.array(pd.isnull(ddvals),dtype=bool)] = np.nan  # @TCC Does not propagate nan correctly
    return crossing_idx_rv #, cdd_rv

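A small worked example of the crossing logic (the values and threshold are arbitrary):

In [ ]:
vals = pd.Series([5.0, 10.0, 20.0, 15.0, 5.0])
# starting at index 0: 5+10+20 = 35 >= 30, so the crossing is at index 2;
# starting at index 3: 15+5 = 20 never reaches 30, so the result is NaN
compute_threshold_passings(vals, 30.0)  # -> array([ 2.,  2.,  3., nan, nan])
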
In [5]:
def _ddvals_to_outdf(dd, dd_threshold):
    """Used internally.
    Prepare a nice output DataFrame from degree-day values.
    """
    # find the point where cumulative sums of degree-days cross the threshold
    dd['idx'] = compute_threshold_passings(dd['DD'], dd_threshold)
    # convert those indexes into end times
    i = dd['idx']  # just for convenience
    e = pd.Series(index=dd.index, dtype='datetime64[ns]')
    e[i.notnull()] = dd.index[i[i.notnull()].astype(int)]
    dd['end'] = e
    # ... and durations
    dd['dur'] = dd['end']-dd.index+pd.Timedelta(days=1)
    dd['dur_days'] = dd['dur'].apply(lambda x: np.nan if pd.isnull(x) else x.days)
    return dd

In [6]:
def compute_summation_DD_from_hourly_df(temperature_df, base_temp, dd_threshold):
    """hourly summation degree-day method
    to compute when a cumulative threshold has been passed 
    given hourly temperature data.
    
    NOTE: Will actually work with non-hourly and non-equally-spaced data,
    but accuracy drops at lower sampling frequencies, and readings that cross midnight are not handled properly.
    """
    # copy so we don't accidentally modify the input dataframe/series
    t = pd.DataFrame(data=temperature_df.copy(deep=True), index=temperature_df.index)
    # difference between consecutive timestamps in units of (float) days
    t['dDays'] = t.index
    t['dDays'] = t['dDays'].diff().dt.total_seconds() / (24*60*60)
    # temperature above the base temp (or 0 if below);
    # the input temperature column is assumed to be named 'AT'
    t['dAT'] = t['AT']-base_temp
    t.loc[t['dAT'] < 0, 'dAT'] = 0
    # thermal accumulation for each time period
    t['DD'] = t['dAT']*t['dDays']
    # compute the degree-days for each day
    dd = t['DD'].groupby(pd.Grouper(freq='D')).sum().to_frame(name='DD')
    return _ddvals_to_outdf(dd, dd_threshold)

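A minimal sketch of this method on synthetic data; the 'AT' column name is the one the function expects, while the dates, base temperature, and threshold here are arbitrary:

In [ ]:
idx = pd.date_range('2016-06-01', periods=10*24, freq='1h')
df = pd.DataFrame({'AT': 15.0 + 8.0*np.sin(2*np.pi*(idx.hour-9)/24.0)}, index=idx)
out = compute_summation_DD_from_hourly_df(df, base_temp=10.0, dd_threshold=50.0)
out[['DD', 'end', 'dur_days']].head()
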
In [7]:
def compute_single_triangle_DD_from_hourly_df(temperature_df, base_temp, dd_threshold):
    """single triangle degree-day method
    to compute when a cumulative threshold has been passed 
    given hourly temperature data.
    """
    # copy so we don't accidentally modify the input dataframe/series
    t = pd.DataFrame(data=temperature_df.copy(deep=True), index=temperature_df.index)

    # compute the degree-days for each day in the temperature input
    dfgrp = t.groupby(pd.Grouper(freq='D'))
    dd = dfgrp.agg(lambda x: _compute_daily_SingleTri_DD_from_hourly_groupby(x, base_temp))
    dd.columns = ['DD']
    return _ddvals_to_outdf(dd, dd_threshold)

# Used internally
def _compute_daily_SingleTri_DD(mint, maxt, avet, base_temp):
    """Use standard Baskerville-Ermin (single sine) degree-day method
    to compute the degree-day values for each a single day.
    """
    dd = np.nan # value which we're computing
    # Adjust for observation time; not relevant
    # if max < base (curve all below base)
    if maxt <= base_temp:
        dd = 0
    # min <= base; tri from max to base_temp
    elif mint <= base_temp:
        dd = (maxt-base_temp)*(maxt-base_temp)*(0.5/(maxt-mint))
    # else whole tri extends above; tri from max to min + rect from min to base
    else:
        assert mint > base_temp
        dd = (maxt-mint)/2.0 + (mint-base_temp)
    return dd

def _compute_daily_SingleTri_DD_from_hourly_groupby(grp, base_temp):
    if grp.isnull().values.any():
        return np.nan
    mint = np.min(grp)
    maxt = np.max(grp)
    avet = (mint+maxt)/2.0 # simple midpoint (like in the refs)
    assert not np.isnan(mint)
    return _compute_daily_SingleTri_DD(
                            mint=mint,
                            maxt=maxt,
                            avet=avet,
                            base_temp=base_temp)

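The three branches of _compute_daily_SingleTri_DD, worked on made-up numbers with a base temperature of 10:

In [ ]:
# max below base: the whole triangle is below the threshold, so 0
print(_compute_daily_SingleTri_DD(mint=2.0, maxt=8.0, avet=5.0, base_temp=10.0))    # 0
# min below base: only the tip above base counts, (15-10)**2 * 0.5/(15-5) = 1.25
print(_compute_daily_SingleTri_DD(mint=5.0, maxt=15.0, avet=10.0, base_temp=10.0))  # 1.25
# min above base: triangle plus rectangle, (20-12)/2 + (12-10) = 6.0
print(_compute_daily_SingleTri_DD(mint=12.0, maxt=20.0, avet=16.0, base_temp=10.0)) # 6.0
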
In [8]:
def compute_BM_DD_from_hourly_df(temperature_df, base_temp, dd_threshold):
    """Use standard Baskerville-Ermin (single sine) degree-day method
    to compute when a cumulative threshold has been passed 
    given hourly temperature data.
    (though its value is computed using daily min & max temperatures)
    """
    # copy so we don't accidentally modify the input dataframe/series
    t = pd.DataFrame(data=temperature_df.copy(deep=True), index=temperature_df.index)

    # compute the degree-days for each day in the temperature input
    dfgrp = t.groupby(pd.Grouper(freq='D'))
    dd = dfgrp.agg(lambda x: _compute_daily_BM_DD_from_hourly_groupby(x, base_temp))
    dd.columns = ['DD']
    return _ddvals_to_outdf(dd, dd_threshold)

def compute_BM_DD_from_daily_min_and_max(mint, maxt, base_temp, dd_threshold):
    dd = pd.DataFrame({'minT':mint, 'maxT':maxt}, index=mint.index)
    dd['DD'] = dd.apply(lambda x:
                    _compute_daily_BM_DD(x['minT'], x['maxT'], (x['minT']+x['maxT'])/2.0, base_temp),
                    axis=1)
    return _ddvals_to_outdf(dd, dd_threshold)

# Used internally
def _compute_daily_BM_DD(mint, maxt, avet, base_temp):
    """Use standard Baskerville-Ermin (single sine) degree-day method
    to compute the degree-day values for each a single day.
    """
    dd = np.nan # value which we're computing
    # Step 1: Adjust for observation time; not relevant
    # Step 2: GDD = 0 if max < base (curve all below base)
    if maxt < base_temp:
        dd = 0
    # Step 3: Calc mean temp for day; already done previously
    # Step 4: min > base; then whole curve counts
    elif mint >= base_temp:
        dd = avet - base_temp
    # Step 5: else use curve minus part below base
    else:
        W = (maxt-mint)/2.0
        tmp = (base_temp-avet) / W
        if tmp < -1:
            print('WARNING: (base_temp-avet)/W = {} : should be [-1:1]'.format(tmp))
            tmp = -1
        if tmp > 1:
            print('WARNING: (base_temp-avet)/W = {} : should be [-1:1]'.format(tmp))
            tmp = 1
        A = np.arcsin(tmp)
        dd = ((W*np.cos(A))-((base_temp-avet)*((np.pi/2.0)-A)))/np.pi
    return dd

def _compute_daily_BM_DD_from_hourly_groupby(grp, base_temp):
    mint = np.min(grp)
    maxt = np.max(grp)
    avet = (mint+maxt)/2.0 # simple midpoint (like in the refs)
    return _compute_daily_BM_DD(
                            mint=mint,
                            maxt=maxt,
                            avet=avet,
                            base_temp=base_temp)

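Two quick sanity checks of _compute_daily_BM_DD on made-up numbers (base temperature 10), followed by a sketch of the daily min/max driver on arbitrary synthetic series:

In [ ]:
# whole sine curve above base: degree-days reduce to avet - base_temp = 6.0
print(_compute_daily_BM_DD(mint=12.0, maxt=20.0, avet=16.0, base_temp=10.0))
# curve straddling the base with avet == base: A = 0, so dd = W/pi = 5/pi ~= 1.59
print(_compute_daily_BM_DD(mint=5.0, maxt=15.0, avet=10.0, base_temp=10.0))

days = pd.date_range('2016-06-01', periods=5, freq='D')
tmin = pd.Series([8.0, 9.0, 11.0, 12.0, 10.0], index=days)
tmax = pd.Series([18.0, 20.0, 24.0, 25.0, 22.0], index=days)
compute_BM_DD_from_daily_min_and_max(tmin, tmax, base_temp=10.0, dd_threshold=20.0)
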
In [9]:
## Cython version of compute_threshold_passings
## might be slightly faster (numpy.searchsorted is pretty good though)
## However, it makes portability more complicated
# %load_ext Cython

In [10]:
# %%cython
# # Used internally
# import numpy as np
# cimport numpy as np
# import cython
# from libc.math cimport isnan

# def compute_threshold_passings(double[:] ddvals, double cumulative_DD_threshold):
#     cdef double cDD
#     cdef int found
#     cdef double[:] crossing_idx_rv = np.zeros(len(ddvals))*np.nan
#     #cdef double[:] cdd_rv = np.zeros(len(ddvals))*np.nan

#     # @TCC -- Probably don't need to do this for all possible start dates... 
#     for i in range(len(ddvals)):
#         cDD = 0
#         found = 0
#         for j in range(i,len(ddvals)):
#             #print(start_d.Index, d.Index, cDD)
#             if isnan(ddvals[j]):
#                 crossing_idx_rv[i] = np.nan
#                 found = 1 # we might find more later
#                 break
#             else:
#                 cDD += ddvals[j]
#                 if cDD > cumulative_DD_threshold:
#                     found = 1
#                     crossing_idx_rv[i] = j
#                     #dd_rv[i] = cDD # @TCC Not really needed
#                     break
#         if found != 1: # won't find any more either
#             break

#     return crossing_idx_rv #, cdd_rv
